---
title: "Study 1 Cleaning"
author: "Kaylyn Jackson Schiff, Daniel Schiff, and Natalia Bueno"
date: "2020"
output: pdf_document
editor_options: 
  chunk_output_type: console
---

##### Setup Chunk
```{r setup, include=FALSE}
# Echo the code of every chunk in the rendered document by default.
knitr::opts_chunk$set(echo = TRUE)

# Attach packages with library(): it stops with an error when a package is
# missing, whereas require() only warns and returns FALSE, letting the script
# fail later at the first unresolved function call.
library(rmarkdown)
library(skimr)
library(readr)
library(tidyverse)

# Large penalty against scientific notation, so numbers print in fixed notation.
options(scipen = 999)
```

##### Functions
```{r functions}
set.seed(1234)

# Build a standardized mean effects index from several outcome measures.
#
# Each outcome column is z-scored against the control group (rows with
# Z == 0), the z-scores are averaged within each row, and that average is
# standardized once more against the control group's mean and standard
# deviation.
#
# Arguments:
#   Z            Treatment indicator with one entry per row of `outcome_mat`;
#                rows with Z == 0 form the control group. When `impute = TRUE`
#                Z must be coded 0/1, because Z + 1 is used to look up the
#                group means.
#   outcome_mat  Matrix or data frame of numeric outcomes (rows are units,
#                columns are outcomes). A single column is allowed.
#   to_reorient  Not used by the function body; kept so existing calls work.
#   reorient     Not used by the function body; kept so existing calls work.
#   greedy       TRUE: divide each row's sum by its number of non-missing
#                outcomes. FALSE: divide by the number of columns.
#   impute       TRUE: replace each missing outcome with the column mean of
#                the unit's own treatment group before building the index.
#
# Returns a numeric vector with one index value per row of `outcome_mat`.
# Stops with an error when length(Z) differs from nrow(outcome_mat).
calculate_mean_effects_index <- function(Z, outcome_mat, to_reorient, reorient = FALSE, greedy = TRUE,
                                         impute = FALSE){
  if(length(Z) != nrow(outcome_mat)) stop("Error: Treatment assignment, outcome matrix require same n!")

  is_control <- Z == 0

  if(impute == TRUE){
    # R marks the missing cells (1 = missing, 0 = observed).
    R <- 1 * is.na(outcome_mat)
    # Row 1 holds the control-group column means, row 2 the treated-group ones.
    # drop = FALSE keeps a one-column (or one-row) subset two-dimensional so
    # that apply() still works.
    means_for_imputation <- rbind(
      apply(X = outcome_mat[is_control, , drop = FALSE], MARGIN = 2, FUN = mean, na.rm = TRUE),
      apply(X = outcome_mat[Z == 1, , drop = FALSE], MARGIN = 2, FUN = mean, na.rm = TRUE))
    # Z + 1 picks the matching row of group means for every unit; multiplying
    # by R keeps those means only in the cells that were missing.
    to_impute <- R * means_for_imputation[Z + 1, , drop = FALSE]
    outcome_mat[is.na(outcome_mat)] <- 0
    outcome_mat <- outcome_mat + to_impute
  }

  # Control-group mean and standard deviation of every outcome.
  control_rows <- outcome_mat[is_control, , drop = FALSE]
  c_mean <- apply(X = control_rows, MARGIN = 2, FUN = mean, na.rm = TRUE)
  c_sd <- apply(X = control_rows, MARGIN = 2, FUN = sd, na.rm = TRUE)

  # Center each column on its control mean, then scale by its control sd. The
  # double transpose makes the division run column-wise.
  z_score <- t(t(sweep(outcome_mat, 2, c_mean)) / c_sd)

  # NOTE(review): rowSums() is called without na.rm, so a row with any missing
  # outcome gets an NA index whatever `greedy` is. Left unchanged so existing
  # results are not altered; confirm whether na.rm = TRUE was intended here.
  index_numerator <- rowSums(z_score)
  if(greedy == TRUE){
    n_outcomes <- rowSums(!is.na(z_score))
  }
  else if(greedy == FALSE){
    n_outcomes <- ncol(outcome_mat)
  }
  index <- index_numerator / n_outcomes

  # Re-standardize against the control group's mean and standard deviation.
  index <- (index - mean(index[is_control], na.rm = TRUE)) / sd(index[is_control], na.rm = TRUE)
  index
}
```

##### Import Data
```{r import data}
# One-time preparation of the raw CSV export (kept for reference, not run):
# read the column names from the header row, re-read the file skipping its
# first three rows, remove the PII variable, and save the result as .rds.
# df_names <- read_csv("data/lucid_sample_2.23.21.csv", n_max = 0) %>% names()
# df_raw <- read_csv("data/lucid_sample_2.23.21.csv", col_names = df_names, skip = 3)
# df <- df_raw

# Excluding zip data (the PII variable)
#df <- df %>% select(-zip)
#saveRDS(df, "data/lucid_sample_2.23.21.rds")

# Load the saved data set; presumably the file written by the commented-out
# steps above, i.e. already without the zip variable.
df <- readRDS("data/lucid_sample_2.23.21.rds")

```

##### Explore Data
```{r explore data}
# Quick visual check of the loaded data: print it, then show the column types
# and the column names.
df
glimpse(df)
colnames(df)
```

##### Subset Data
```{r subset data}
# Keep only the variables used in the cleaning steps below.
df <- dplyr::select(
  df,
  # respondent data
  ResponseId,
  # screening questions
  screener_1_1:screener_1_21,
  screener_2_1:screener_2_6,
  # politician name
  politician,
  # media format
  video, text,
  # allegations
  IU, OR, Control,
  # belief outcomes
  belief_1, belief_2,
  # support outcomes
  support_1, support_2, support_3, support_4,
  # trust outcomes
  trust_1, trust_2,
  # demographics
  party, political_party, gender, race, age, education_2, hhi, region,
  # news media and digital literacy
  literacy1, literacy2, literacy3, familiar_df
)

# summary statistics and class
skim(df)
```

##### Recode and Create Variables
```{r recode and create new variables}
#Create attentiveness index
# A screener counts as passed (1) only when exactly the expected options are
# recorded: option 21 alone for screener 1, options 3 and 5 alone for
# screener 2. The code treats NA in an option column as "not selected".
# attentiveness is the number of screeners passed (0, 1 or 2).
df <- df %>% dplyr::mutate(
  screener_1 = if_else(
    rowSums(!is.na(dplyr::select(df, all_of(paste0("screener_1_", 1:20))))) == 0 &
      (!is.na(screener_1_21) & screener_1_21 == 1),
    1, 0),
  screener_2 = if_else(
    rowSums(!is.na(dplyr::select(df, all_of(paste0("screener_2_", c(1, 2, 4, 6)))))) == 0 &
      (!is.na(screener_2_3) & screener_2_3 == 1) &
      (!is.na(screener_2_5) & screener_2_5 == 1),
    1, 0),
  attentiveness = rowSums(cbind(screener_1, screener_2)) #create index
)

#Create variables for treatment assignments
# Collapse the 0/1 assignment indicators into one labelled factor per
# experimental dimension; rows matching none of the indicators become NA.
df <- df %>% mutate(
  alleg_treatment = factor(
    case_when(
      Control == 1 ~ "Control",
      IU == 1 ~ "IU",
      OR == 1 ~ "OR"
    ),
    levels = c("Control", "IU", "OR"),
    labels = c("Control", "Info. Uncertain", "Opp. Rally"),
    ordered = FALSE
  ),
  media_format = factor(
    case_when(
      video == 1 ~ "Video",
      text == 1 ~ "Text"
    ),
    levels = c("Text", "Video"),
    labels = c("Text", "Video"),
    ordered = FALSE
  )
)

# Turn a 1/2/3 party code (1 = Democrat, 2 = Independent, 3 = Republican) into
# an unordered factor with Independent as the reference category.
as_party_3 <- function(code) {
  factor(code, levels = c(2, 1, 3),
         labels = c("Independent", "Democrat", "Republican"),
         ordered = FALSE)
}

#Recode political ideology
df <- df %>% mutate(
  # Leaners (codes 3 and 5) counted as Independent
  party_3 = as_party_3(case_when(
    party %in% c(1, 2) ~ 1,
    party %in% c(3, 4, 5) ~ 2,
    party %in% c(6, 7) ~ 3
  )),
  # Leaners counted with the party they lean towards
  party_3_lean = as_party_3(case_when(
    party %in% c(1, 2, 3) ~ 1,
    party %in% 4 ~ 2,
    party %in% c(5, 6, 7) ~ 3
  )),
  # Full seven-point scale, recoded last because the two variables above need
  # the numeric codes
  party = factor(party, levels = c(4, 1, 2, 3, 5, 6, 7),
                 labels = c("Independent", "Strong Democrat", "Democrat", "Lean Democrat",
                            "Lean Republican", "Republican", "Strong Republican"),
                 ordered = FALSE)
)

#Recode pre-treatment political ideology
df <- df %>% mutate(
  pre_party_3 = as_party_3(case_when(
    political_party %in% c(1, 2) ~ 1,
    political_party %in% c(3:8) ~ 2,
    political_party %in% c(9, 10) ~ 3
  )),
  pre_party_3_lean = as_party_3(case_when(
    political_party %in% c(1, 2, 3, 6) ~ 1,
    political_party %in% c(4, 7) ~ 2,
    political_party %in% c(5, 8, 9, 10) ~ 3
  ))
)

# Score the match between a respondent's party and the politician shown:
#    1  same party as the politician
#   -1  opposite party
#    0  respondent is Independent
#   NA  any other combination (e.g. missing party or unlisted politician)
partisan_score <- function(party, politician) {
  shown_democrat <- politician == "Jesse Jackson" | politician == "John Murtha"
  shown_republican <- politician == "Todd Akin" | politician == "Tim James"
  case_when(
    party == "Democrat" & shown_democrat ~ 1,
    party == "Democrat" & shown_republican ~ -1,
    party == "Republican" & shown_republican ~ 1,
    party == "Republican" & shown_democrat ~ -1,
    party == "Independent" ~ 0
  )
}

df <- df %>% mutate(
  #Create strong co-partisanship variable (1 = co-partisan, 0 = anti-partisan or Independent)
  copartisan = if_else(partisan_score(party_3, politician) == 1, 1, 0),
  #Create moderate partisanship variable
  moderate = if_else(party_3 == "Independent", 1, 0),
  #Create anti-partisanship variable (1 = anti-partisan, 0 = co-partisan or Independent)
  antipartisan = if_else(partisan_score(party_3, politician) == -1, 1, 0),
  #Create partisanship variable
  partisan = partisan_score(party_3, politician),
  #Create partisanship variable with leaners
  partisan_lean = partisan_score(party_3_lean, politician),
  #Create pre-treatment partisanship variable
  pre_partisan = partisan_score(pre_party_3, politician),
  #Create pre-treatment partisanship variable with leaners
  pre_partisan_lean = partisan_score(pre_party_3_lean, politician)
)



df <- df %>% mutate(
  #Recode gender
  gender = factor(gender, levels = 1:2, labels = c("Male", "Female"), ordered = FALSE),

  #Recode race/ethnicity
  race = factor(race, levels = 1:5,
                labels = c("White", "Black", "Hispanic", "Asian", "Other"),
                ordered = FALSE),

  #Recode age
  # Cut-offs are written as 2020 minus a birth year; age2 holds the numeric
  # band and age is then replaced by the labelled factor.
  age2 = case_when(
    age <= 2020 - 1997 ~ 1,
    age >= 2020 - 1996 & age <= 2020 - 1981 ~ 2,
    age >= 2020 - 1980 & age <= 2020 - 1965 ~ 3,
    age >= 2020 - 1964 & age <= 2020 - 1946 ~ 4,
    age >= 2020 - 1945 ~ 5
  ),
  age = factor(age2, levels = 1:5,
               labels = c("Gen Z", "Millennials", "Gen X", "Boomers", "Silent"),
               ordered = FALSE),

  #Recode education
  education = factor(
    case_when(
      education_2 <= 2 ~ 1,                      #High school graduate or less
      education_2 >= 3 & education_2 <= 4 ~ 2,   #Some college or Associate degree
      education_2 == 5 ~ 3,                      #Bachelor's degree
      education_2 >= 6 ~ 4                       #Graduate degree
    ),
    levels = 1:4,
    labels = c("High school graduate or less", "Some college", "Bachelor's degree", "Graduate degree"),
    ordered = FALSE
  )
)

#Recode income
# -3105 marks a missing household-income response; replace it with the mean
# of the observed responses. %in% and na.rm = TRUE keep an NA elsewhere in
# hhi from turning that mean (and so every imputed value) into NA.
df$hhi <- ifelse(df$hhi %in% -3105,
                 mean(df$hhi[!(df$hhi %in% -3105)], na.rm = TRUE),
                 df$hhi)

# Income bands are half-open so that the (generally non-integer) imputed mean
# cannot fall into a gap between two bands. For the integer response codes
# the result is the same as with the closed bands 1-4 / 5-13 / 14+.
df <- df %>% mutate(income = case_when(
  hhi >= 1 & hhi < 5 ~ 1,            #Under $30k
  hhi >= 5 & hhi < 14 ~ 2,           #$30k to under $75k
  hhi >= 14 ~ 3                      #At least $75k
  ))

# Middle income is listed first so that it is the reference category.
df$income <- factor(df$income, levels = c(2,1,3), 
                  labels = c("Middle income","Low income","High income"), 
                  ordered = FALSE)

df <- df %>%
  mutate(
    #Recode region
    region = factor(region, levels = 1:4,
                    labels = c("Northeast", "Midwest", "South", "West"),
                    ordered = FALSE),

    #Create news media literacy index variable
    # One point per literacy item answered with the keyed response; the index
    # is the row total and is NA when any item is NA.
    literacy1_correct = if_else(literacy1 == 2, 1, 0),
    literacy2_correct = if_else(literacy2 == 4, 1, 0),
    literacy3_correct = if_else(literacy3 == 2, 1, 0),
    media_literacy = rowSums(cbind(literacy1_correct, literacy2_correct, literacy3_correct))
  ) %>%
  #Rename digital literacy
  rename(digital_literacy = familiar_df)

# Mean effects index over the named outcome columns of `data`, using
# data$treat as the treatment indicator.
mean_effects_for <- function(data, ...) {
  calculate_mean_effects_index(Z = data$treat,
                               outcome_mat = dplyr::select(data, ...),
                               reorient = FALSE)
}

# Binary treatment indicator: 0 for the control condition, 1 for any allegation
df <- df %>% mutate(treat = ifelse(alleg_treatment == "Control", 0, 1))

df <- df %>% mutate(
  #Create index for belief
  belief = mean_effects_for(df, belief_1, belief_2),
  #Create index for support
  support = mean_effects_for(df, support_1, support_2, support_3, support_4),
  #Create index for support without donations
  support_nodonation = mean_effects_for(df, support_1, support_2, support_3),
  #Create index for trust
  trust = mean_effects_for(df, trust_1, trust_2)
)

#Reselect variables
# Show every column currently available, then keep the cleaned variables in
# their final order.
colnames(df)
df <- dplyr::select(
  df,
  # respondent data
  ResponseId,
  # attentiveness index
  attentiveness,
  # politician name
  politician,
  # media format
  media_format,
  # allegations
  alleg_treatment,
  # partisanship indicators
  copartisan, moderate, antipartisan,
  party_3, party_3_lean, pre_party_3, pre_party_3_lean,
  partisan, partisan_lean, pre_partisan, pre_partisan_lean,
  # outcome indices, followed by the individual outcome items
  belief, support, support_nodonation, trust,
  belief_1:trust_2,
  # demographics
  party, gender, race, age, education, income, region,
  # news media and digital literacy
  media_literacy, digital_literacy
)


```

##### Export File
```{r export file}
# Save the cleaned data. The destination is passed positionally because the
# `path` argument of write_rds() is deprecated in favour of `file`; the
# positional form works with both older and newer readr versions.
write_rds(df, "data/df_clean.rds")
```
